/*************************************************************************
READ_WHO.SAS

This program reads in the csv files downloaded from the WHO's Mortality
    Database. It is very important to read the documentation provided
    by the WHO.

Some countries used ICD9 codes throughout the sample period, and others
    switched to ICD10 during the sample period. Some countries provide
    data at a finer level of detail than others. To maximize coverage,
    I apply codes listed in column 1 of table 5 and column 1 of table 8
    of the WHO documentation.

Please refer to the documentation provided by the WHO on the specific disease
    codes, and the codes that are specific to certain countries.

This program requires three other files:
    icd9_code_fixes.code, which converts the ICD9 codes in detailed format
       to the basic tabulation list (which had better coverage).
    icd9_to_whocodes.code and icd10_to_whocodes.code, which assign
       a common "WHO code" to all ICD9 and ICD10s in the original data.

  The output is a dataset of country-disease-year observations.
    
*************************************************************************/

* Define the local directory in which you are working;
%LET LOCAL = /mnt/Research/RESTAT;
LIBNAME HERE "&LOCAL";

* Read in raw data;
    DATA MORTICD9;
    INFILE 'morticd9.csv' DELIMITER = ',' MISSOVER DSD LRECL=32767 FIRSTOBS=2 ;
       FORMAT ADMIN1 $1. ;
       FORMAT SUBDIV $1. ;
       FORMAT LIST $3. ;
       FORMAT CAUSE $4. ;
    INPUT
                COUNTRY
                ADMIN1 $
                SUBDIV $
                YEAR
                LIST $
                CAUSE $
                SEX
                FRMAT
                IM_FRMAT
                DEATHS1-DEATHS26
                IM_DEATHS1-IM_DEATHS4
    ;

    DATA MORTICD10;
    INFILE 'morticd10.csv' DELIMITER = ',' MISSOVER DSD LRECL=32767 FIRSTOBS=2 ;
       FORMAT ADMIN1 $1. ;
       FORMAT SUBDIV $1. ;
       FORMAT LIST $3. ;
       FORMAT CAUSE $4. ;
    INPUT
                COUNTRY
                ADMIN1 $
                SUBDIV $
                YEAR
                LIST $
                CAUSE $
                SEX
                FRMAT
                IM_FRMAT
                DEATHS1-DEATHS26
                IM_DEATHS1-IM_DEATHS4
    ;


DATA ICD9_DATA (KEEP=COUNTRY_CODE YEAR WHO_CODE DEATHS);
    SET MORTICD9;
    RENAME COUNTRY=COUNTRY_CODE;
  * Drop all deaths line;
    IF CAUSE NE 'B00';  
  * Restrict to country_code-level data;
    IF ADMIN1 EQ '';
  * This is the total over all age groups;
    DEATHS = DEATHS1;
  * Assign new disease coding;
    ICD9 = CAUSE;
    DROP CAUSE;
    %INCLUDE 'icd9_code_fixes.code';
    %INCLUDE 'icd9_to_whocodes.code';

DATA ICD10_DATA (KEEP=COUNTRY_CODE YEAR WHO_CODE DEATHS);
    SET MORTICD10;
    RENAME COUNTRY=COUNTRY_CODE;        
  * Drop all deaths line;
    IF CAUSE NE '1000'; 
    IF CAUSE NE 'AAA'; 
  * Restrict to country_code-level data;
    IF ADMIN1 EQ '';
  * This is the total over all age groups;
    DEATHS = DEATHS1;
  * Assign new disease coding;
    ICD10 = CAUSE;
    DROP CAUSE;
    DROP SUBDIV;    
    %INCLUDE 'icd10_to_whocodes.code';

* Combine ICD9 and ICD10;    
  DATA ICD_DATA;
    SET ICD10_DATA ICD9_DATA;
  * Drop non-drug related causes of death;
    IF WHO_CODE NOT IN('1094','1095','1096','1097','1098','1099','1100','1101','1102','1103');
* Sum over country-disease-year, across gender;
  PROC SORT DATA=ICD_DATA;
      BY COUNTRY_CODE WHO_CODE YEAR;
  PROC MEANS DATA=ICD_DATA (WHERE=(WHO_CODE NE '')) NOPRINT;
      BY COUNTRY_CODE WHO_CODE YEAR;
      OUTPUT OUT=ICD_DATA SUM(DEATHS)=DEATHS;
       
* Rectangularize so that there is one observation per country-year-disease;
  PROC SORT DATA=ICD_DATA (KEEP=COUNTRY_CODE) NODUPKEY OUT=COUNTRYLIST;
      BY COUNTRY_CODE;
  PROC SORT DATA=ICD_DATA (KEEP=WHO_CODE) NODUPKEY OUT=DISEASELIST;
      BY WHO_CODE;
  PROC SORT DATA=ICD_DATA (KEEP=YEAR) NODUPKEY OUT=YEARLIST;
      BY YEAR;
  PROC SQL;
      CREATE TABLE RECTANGLE AS
          SELECT COUNTRY_CODE, WHO_CODE, YEAR
          FROM COUNTRYLIST, DISEASELIST, YEARLIST
          ORDER BY COUNTRY_CODE, WHO_CODE, YEAR;
  PROC SORT DATA=ICD_DATA;
      BY COUNTRY_CODE WHO_CODE YEAR;
  DATA RECT_WHO (DROP=_TYPE_ _FREQ_);
      MERGE RECTANGLE ICD_DATA;
      BY COUNTRY_CODE WHO_CODE YEAR;
      
* Use multiple imputation to deal with missing values;
  PROC SORT DATA=RECT_WHO;
    BY COUNTRY_CODE WHO_CODE YEAR;
  PROC MI DATA=RECT_WHO OUT=HERE.WHO_DEATHS_I NIMPUTE=5
    MINIMUM= 0 1979;
    BY COUNTRY_CODE WHO_CODE;
    EM;
    VAR DEATHS YEAR;

  PROC MEANS DATA=RECT_WHO;
  PROC MEANS DATA=HERE.WHO_DEATHS_I;
